rm(list=ls())

Load requisite packages and data

library(ezids)
library(ggplot2)
library(dplyr)
library(readr)
library(tidyverse)
library (tidyr)
library(janitor)
library(scales)
library(ggrepel)
library(corrplot)
library(tigris)
library(sf)

This project uses the State and County Housing Market Indicators dataset from the American Enterprise Institute Housing Center, found here. The variables are:

Original Variable Name New Variable Name Definition
State State state
County County_Name County
FIPS FIPS_County_Code 5-digit Federal Information Processing Series codes (first 2 digits indicate state, last 3 indicate sub-county entity)
Year Year Year when the data was collected
Tier Affordability Categorizes home sales into entry-level (<=80th percentile of FHA sales prices), move-up (all others), and all
Median.Sale.Price..in.Thousands. Median_Sale_Price_in_k Median sale price in thousands of USD per county
House.Price.Appreciation.since.2012 House_Price_Appreciation_since_2012_percent Cumulative home price appreciation since 2012
House.Price.Appreciation..Year.over.Year House_Price_Appreciation_yr_over_yr_percent Home price appreciation since the previous year
Months..Supply Months_Supply Number of months it would take for the inventory of existing homes for sale to be exhausted at the current sales pace
New.Construction.Share.of.Sales New_Constr_by_share_of_sales_percent Percent of sales comprising new construction
Mortgage.Default.Rate Mortgage_Default_Rate_percent AEI Mortgage Default Rate, a measure of how loans originating in a given month would perform under the same conditions as the 2007 financial crisis (<=7%: Low Risk; between 7.01% and 14%: Medium Risk; >14%: High Risk)
housing = read.csv("/Users/ilgazkuscu/Documents/GitHub/housing-price-vs-supply-2024/Data/state_county_data_download_2025.csv")
housing %>% slice_sample(n=5)
##   State              County  FIPS Year       Tier
## 1    LA       Iberia Parish 22045 2022        all
## 2    NY     Dutchess County 36027 2017 entrylevel
## 3    KY       Estill County 21065 2024        all
## 4    MD Queen Anne'S County 24035 2019        all
## 5    MO    Jefferson County 29099 2013 entrylevel
##   Median.Sale.Price..in.Thousands. House.Price.Appreciation.since.2012
## 1                              128                   46.5000003576279%
## 2                              185                   12.5999987125397%
## 3                               90                   141.899991035461%
## 4                              327                   23.9999994635582%
## 5                              120                                  0%
##   House.Price.Appreciation..Year.over.Year. Months..Supply
## 1                         11.8000000715256%            2.3
## 2                         5.20000010728836%            6.1
## 3                         16.0999998450279%            1.0
## 4                         2.19999998807907%            5.4
## 5                                        0%            7.8
##   New.Construction.Share.of.Sales Mortgage.Default.Rate
## 1                1.7000000923872%     26.1000007390976%
## 2              0.700000021606684%      14.300000667572%
## 3               6.89999982714653%     16.8999999761581%
## 4               11.2000003457069%     13.3000001311302%
## 5               5.00000007450581%     17.7000001072884%

The data is limited to the year 2024 and cleaned of NA values, and the variables are renamed for clarity.

housing_2024 = housing %>% filter(housing$Year == 2024, 
                                  housing$State != 'AA National', 
                                  housing$County != 'AA State') %>%
  #moving this na removal to the end after the cleaning
  na.omit %>% 
  #excluding the armed forces


#rename cols
rename(
  Median_Sale_Price_per_k = Median.Sale.Price..in.Thousands.,
  House_Price_Appreciation_yr_over_yr_percent = House.Price.Appreciation..Year.over.Year.,
  House_Price_Appreciation_since_2012_percent = House.Price.Appreciation.since.2012,
  Months_Supply = Months..Supply,
  New_Constr_by_share_of_sales_percent = New.Construction.Share.of.Sales,
  Mortgage_Default_Rate_percent = Mortgage.Default.Rate,
  County_Name = County,
  FIPS_County_Code = FIPS, 
  Affordability = Tier
)
head(housing_2024)
##    State            County_Name FIPS_County_Code Year Affordability
## 7     AK Anchorage Municipality             2020 2024           all
## 8     AK Anchorage Municipality             2020 2024    entrylevel
## 9     AK Anchorage Municipality             2020 2024        moveup
## 10    AK     Bethel Census Area             2050 2024           all
## 11    AK     Bethel Census Area             2050 2024    entrylevel
## 12    AK     Bethel Census Area             2050 2024        moveup
##    Median_Sale_Price_per_k House_Price_Appreciation_since_2012_percent
## 7                      414                           51.8999934196472%
## 8                      325                           49.6999979019165%
## 9                      581                           54.1999995708466%
## 10                     389                                            
## 11                     350                                            
## 12                     516                                            
##    House_Price_Appreciation_yr_over_yr_percent Months_Supply
## 7                            4.50000017881393%           2.5
## 8                            3.99999991059303%           2.2
## 9                            5.00000007450581%           2.8
## 10                                                       4.8
## 11                                                       5.3
## 12                                                       3.2
##    New_Constr_by_share_of_sales_percent Mortgage_Default_Rate_percent
## 7                     4.39999997615814%             12.3999997973442%
## 8                     1.89999993890524%             13.6000007390976%
## 9                     7.90000036358833%             9.79999974370003%
## 10                    11.6999998688698%             16.2000000476837%
## 11                                   0%             4.30000014603138%
## 12                    60.9000027179718%             22.2000002861023%

The typing of the variables is also corrected. Some require the symbols “$” and “%” to be removed beforehand, so that is also done.

# as factors
housing_2024$State = as.factor(housing_2024$State)
housing_2024$County_Name = as.factor(housing_2024$County_Name)
housing_2024$FIPS_County_Code = as.factor(housing_2024$FIPS_County_Code)
housing_2024$Affordability = as.factor(housing_2024$Affordability)

# remove prefixes '$' and '%' from values
housing_2024 = housing_2024 %>%
  mutate(Median_Sale_Price_per_k = gsub("[\\$,]", "", Median_Sale_Price_per_k),
         House_Price_Appreciation_since_2012_percent =
           gsub("[%,]","",House_Price_Appreciation_since_2012_percent),
         House_Price_Appreciation_yr_over_yr_percent =
           gsub("[%,]","",House_Price_Appreciation_yr_over_yr_percent),
         New_Constr_by_share_of_sales_percent = gsub("[%,]","",New_Constr_by_share_of_sales_percent),
         Mortgage_Default_Rate_percent = gsub("[%,]","",Mortgage_Default_Rate_percent)
  )
###I included the commas to be removed as well indicated in the brackets. SA

# as num instead of chr
housing_2024$Median_Sale_Price_per_k = as.numeric(housing_2024$Median_Sale_Price_per_k)
housing_2024$House_Price_Appreciation_since_2012_percent =
  as.numeric(housing_2024$House_Price_Appreciation_since_2012_percent)
housing_2024$House_Price_Appreciation_yr_over_yr_percent =
  as.numeric(housing_2024$House_Price_Appreciation_yr_over_yr_percent)
housing_2024$New_Constr_by_share_of_sales_percent = 
  as.numeric(housing_2024$New_Constr_by_share_of_sales_percent)
housing_2024$Mortgage_Default_Rate_percent = as.numeric(housing_2024$Mortgage_Default_Rate_percent)
###I included the commas to be removed as well indicated in the brackets. SA

#moving the na dropping to after the parsing
housing_2024 <- housing_2024 %>%
  tidyr::drop_na(Median_Sale_Price_per_k)

# For some reason is rounding the data in quite a weird way—inaccurately

# view data
housing_2024 %>% slice_sample(n=5)
##   State       County_Name FIPS_County_Code Year Affordability
## 1    NC     Yancey County            37199 2024           all
## 2    WV  Pendleton County            54071 2024        moveup
## 3    IA Winneshiek County            19191 2024           all
## 4    TN      Lewis County            47101 2024           all
## 5    GA Meriwether County            13199 2024           all
##   Median_Sale_Price_per_k House_Price_Appreciation_since_2012_percent
## 1                     260                                       112.9
## 2                     284                                          NA
## 3                     228                                        77.4
## 4                     170                                       154.2
## 5                     220                                       144.2
##   House_Price_Appreciation_yr_over_yr_percent Months_Supply
## 1                                         2.5           4.5
## 2                                          NA          11.1
## 3                                         1.8           2.6
## 4                                         9.5           2.3
## 5                                         9.2           3.5
##   New_Constr_by_share_of_sales_percent Mortgage_Default_Rate_percent
## 1                                  2.6                           9.1
## 2                                  0.0                            NA
## 3                                 10.1                           7.4
## 4                                 11.4                          18.0
## 5                                 19.4                          18.5
#creating df w/ affordability all totals and then another with entry level/moving up totals
housing_2024_all = housing_2024 %>% filter(housing_2024$Affordability == "all") 
housing_2024_tiers = housing_2024 %>% filter(Affordability == "entrylevel" | Affordability == "moveup")
#BW- we need to explain why we did this (not in the comments, in the write-up and presentation). Steph?

Exploratory Data Analysis (EDA)

xkablesummary(housing_2024)
Table: Statistics summary.
State County_Name FIPS_County_Code Year Affordability Median_Sale_Price_per_k House_Price_Appreciation_since_2012_percent House_Price_Appreciation_yr_over_yr_percent Months_Supply New_Constr_by_share_of_sales_percent Mortgage_Default_Rate_percent
Min TX : 620 Washington County: 80 1001 : 3 Min. :2024 all :2751 Min. : 14 Min. : 3.7 Min. :-32.90 Min. : 0.00 Min. : 0.0 Min. : 0.2
Q1 GA : 477 Jefferson County : 72 1003 : 3 1st Qu.:2024 entrylevel:2741 1st Qu.: 148 1st Qu.: 86.6 1st Qu.: 2.80 1st Qu.: 2.20 1st Qu.: 2.7 1st Qu.:10.2
Median KY : 340 Franklin County : 63 1005 : 3 Median :2024 moveup :2669 Median : 260 Median :105.2 Median : 5.70 Median : 3.10 Median : 6.9 Median :13.9
Mean VA : 319 Lincoln County : 60 1007 : 3 Mean :2024 NA Mean : 293 Mean :110.7 Mean : 5.88 Mean : 4.11 Mean : 10.7 Mean :14.3
Q3 NC : 300 Jackson County : 57 1009 : 3 3rd Qu.:2024 NA 3rd Qu.: 391 3rd Qu.:129.6 3rd Qu.: 8.70 3rd Qu.: 5.00 3rd Qu.: 15.0 3rd Qu.:17.5
Max IA : 297 Madison County : 57 1017 : 3 Max. :2024 NA Max. :4400 Max. :279.1 Max. : 89.20 Max. :24.00 Max. :100.0 Max. :36.0
NA (Other):5808 (Other) :7772 (Other):8143 NA NA NA NA’s :1253 NA’s :1280 NA NA NA’s :1036
#separated df's
xkablesummary(housing_2024_all)
Table: Statistics summary.
State County_Name FIPS_County_Code Year Affordability Median_Sale_Price_per_k House_Price_Appreciation_since_2012_percent House_Price_Appreciation_yr_over_yr_percent Months_Supply New_Constr_by_share_of_sales_percent Mortgage_Default_Rate_percent
Min TX : 217 Washington County: 27 1001 : 1 Min. :2024 all :2751 Min. : 17 Min. : 3.7 Min. :-28.70 Min. : 0.10 Min. : 0.00 Min. : 0.5
Q1 GA : 159 Jefferson County : 24 1003 : 1 1st Qu.:2024 entrylevel: 0 1st Qu.: 140 1st Qu.: 88.9 1st Qu.: 3.00 1st Qu.: 2.20 1st Qu.: 2.90 1st Qu.:11.4
Median KY : 114 Franklin County : 21 1005 : 1 Median :2024 moveup : 0 Median : 200 Median :107.3 Median : 5.95 Median : 3.00 Median : 6.40 Median :14.5
Mean VA : 107 Lincoln County : 20 1007 : 1 Mean :2024 NA Mean : 242 Mean :113.4 Mean : 6.00 Mean : 3.44 Mean : 9.51 Mean :14.8
Q3 NC : 100 Jackson County : 19 1009 : 1 3rd Qu.:2024 NA 3rd Qu.: 303 3rd Qu.:132.6 3rd Qu.: 8.60 3rd Qu.: 4.10 3rd Qu.:12.50 3rd Qu.:17.6
Max IA : 99 Madison County : 19 1017 : 1 Max. :2024 NA Max. :2300 Max. :278.0 Max. : 89.20 Max. :24.00 Max. :75.40 Max. :36.0
NA (Other):1955 (Other) :2621 (Other):2745 NA NA NA NA’s :293 NA’s :301 NA NA NA’s :323
xkablesummary(housing_2024_tiers)
Table: Statistics summary.
State County_Name FIPS_County_Code Year Affordability Median_Sale_Price_per_k House_Price_Appreciation_since_2012_percent House_Price_Appreciation_yr_over_yr_percent Months_Supply New_Constr_by_share_of_sales_percent Mortgage_Default_Rate_percent
Min TX : 403 Washington County: 53 1001 : 2 Min. :2024 all : 0 Min. : 14 Min. : 4.5 Min. :-32.90 Min. : 0.00 Min. : 0.0 Min. : 0.2
Q1 GA : 318 Jefferson County : 48 1003 : 2 1st Qu.:2024 entrylevel:2741 1st Qu.: 155 1st Qu.: 85.4 1st Qu.: 2.70 1st Qu.: 2.10 1st Qu.: 2.6 1st Qu.: 9.7
Median KY : 226 Franklin County : 42 1005 : 2 Median :2024 moveup :2669 Median : 314 Median :104.4 Median : 5.60 Median : 3.30 Median : 7.3 Median :13.6
Mean VA : 212 Lincoln County : 40 1007 : 2 Mean :2024 NA Mean : 319 Mean :109.2 Mean : 5.82 Mean : 4.45 Mean : 11.3 Mean :14.0
Q3 NC : 200 Jackson County : 38 1009 : 2 3rd Qu.:2024 NA 3rd Qu.: 419 3rd Qu.:128.3 3rd Qu.: 8.70 3rd Qu.: 5.60 3rd Qu.: 16.1 3rd Qu.:17.5
Max IA : 198 Madison County : 38 1017 : 2 Max. :2024 NA Max. :4400 Max. :279.1 Max. : 86.80 Max. :24.00 Max. :100.0 Max. :36.0
NA (Other):3853 (Other) :5151 (Other):5398 NA NA NA NA’s :960 NA’s :979 NA NA NA’s :713

boxplot of median sale price by state

#boxplotting med sale price by state w/ ..._all df
#all
ggplot(housing_2024_all, aes(x = reorder(State, -Median_Sale_Price_per_k, median), 
                         y = Median_Sale_Price_per_k)) +
  geom_boxplot(fill = "steelblue", alpha = 0.7) +
  coord_flip() +
  labs(
    title = "Distribution of Median Sale Prices by State (2024)",
    x = "State",
    y = "Median Sale Price (in thousands)",
    caption = "SOURCE: American Enterprise Institute") +
  theme_minimal()

There are too many to be particularly useful. You can see general trends, but I am going to run this with a smaller state sample.

boxplot of top and bottom 5 states by housing count

The states broken out by housing count extremes are:

####Top Months Supply States
HI | Hawaii DC | District of Columbia FL | Florida MT | Montana CO | Colorado

####Bottom Months Supply States MA| Massachusetts WI | Wisconsin KY | Kentucky NH | New Hampshire IN | Indiana ND | North Dakota OH | Ohio

#boxplotting with new df: ..._all
#Find top/bottom 5 by *mean Months Supply*, not row count
states_by_mean_months_supply <- housing_2024_all %>%
  group_by(State) %>%
  summarize(Mean_Months_Supply = mean(Months_Supply))

top_states_all = states_by_mean_months_supply %>% slice_max(order_by = Mean_Months_Supply, n = 5)

bottom_states_all <- states_by_mean_months_supply %>% slice_min(order_by = Mean_Months_Supply, n = 5)

#merge top and bottom states
housing_compare_all <- housing_2024_all %>%
  filter(State %in% c(top_states_all$State, bottom_states_all$State)) %>%
  mutate(StateGroup = case_when(
    State %in% top_states_all$State ~ "High Months Supply",
    State %in% bottom_states_all$State ~ "Low Months Supply")) %>%
  mutate(StateGroup = factor(StateGroup, levels = c("Low Months Supply", "High Months Supply")))

#plot top bottom comparison ####
ggplot(housing_compare_all, aes(x = State, y = Median_Sale_Price_per_k, fill = StateGroup)) +
  geom_boxplot() +
  facet_wrap(~ StateGroup, scales = "free_x") +
  labs(
    title = "Mean Sale Price in States by Months Supply of Housing",
    subtitle = "The Mean Sale Price in States with a Larger Months Supply of Housing is Significantly Lower\nthan States with a Smaller Months Supply of Housing",
    x = "State",
    y = "Mean Sale Price (in thousands)",
    caption = "SOURCE: American Enterprise Institute") +
  theme_minimal() +
  scale_fill_manual(values = c("High Months Supply" = "skyblue", "Low Months Supply" = "red"))

Significance Testing

#states_mean_sale_price_anova = aov(data = housing_compare_all)
# compare color scale w/ ..._all df
#fix color assigning for side by side compare scatterplots- too many blue showing up
#cleaned plot version with dropped unused (non-10) state levels; re-group by state for clarity
housing_compare_all_plot <- housing_compare_all %>%
  group_by(State, StateGroup) %>%
  summarize(
    Median_Sale_Price_per_k = median(Median_Sale_Price_per_k, na.rm = TRUE),
    Months_Supply = median(Months_Supply, na.rm = TRUE),
    .groups = "drop") %>%
  mutate(State = forcats::fct_drop(State))

#also fixing subset of housing 2024 all so that gray background dots are 1/state
housing_2024_all_plot <- housing_2024_all %>%
  group_by(State) %>%
  summarize(
    Median_Sale_Price_per_k = median(Median_Sale_Price_per_k, na.rm = TRUE),
    Months_Supply = median(Months_Supply, na.rm = TRUE),
    .groups = "drop")

#label points for 10 states 
label_points_all <- housing_compare_all_plot

#usig cleaned version of plot data to assign colors
top_states_colors_all <- scales::seq_gradient_pal("lightblue", "darkslateblue")(seq(0, 1, length.out = length(unique(housing_compare_all_plot$State[housing_compare_all_plot$StateGroup == "High Months Supply"]))))
names(top_states_colors_all) <- unique(housing_compare_all_plot$State[housing_compare_all_plot$StateGroup == "High Months Supply"])

bottom_states_colors_all <- scales::seq_gradient_pal("lightpink", "darkred")(seq(0, 1, length.out = length(unique(housing_compare_all_plot$State[housing_compare_all_plot$StateGroup == "Low Months Supply"]))))
names(bottom_states_colors_all) <- unique(housing_compare_all_plot$State[housing_compare_all_plot$StateGroup == "Low Months Supply"])

state_colors_all <- c(top_states_colors_all, bottom_states_colors_all)

#all in gray with faceted compare in color with states labeled ####
ggplot() +
  geom_point(data = housing_2024_all_plot, aes(x = Months_Supply, y = Median_Sale_Price_per_k),
             color = "gray70", alpha = 0.3, size = 1.5) +
  geom_point(data = housing_compare_all_plot,
             aes(x = Months_Supply, y = Median_Sale_Price_per_k, color = State),
             size = 1.5, alpha = 0.9) +
  geom_text_repel(data = label_points_all,
                  aes(x = Months_Supply, y = Median_Sale_Price_per_k, 
                      label = State),
                  size = 3.5, color= "black", stroke=0.01, segment.color = NA, segment.size = 0.3,
                  segment.alpha = 1, min.segment.length = 0, show.legend = FALSE) +
  facet_wrap(~ StateGroup) +
  scale_color_manual(values = state_colors_all) +
  labs(
    title = "Months Supply vs Median Price by State",
    subtitle = "The association between Low Months Supply and High Median Sale Price is Immediately Visible, While the",
    x = "Months of Supply",
    y = "Median Sale Price (in thousands)",
    color = "State",
    caption = "SOURCE: American Enterprise Institute") +
  theme_minimal()

#boxplotting med sale price by state w/ ..._compare_all df
#all
ggplot(housing_compare_all, aes(x = reorder(State, -Median_Sale_Price_per_k, median), 
                         y = Median_Sale_Price_per_k)) +
  geom_boxplot(fill = state_colors_all, alpha = 0.7) +
  coord_flip() +
  labs(
    title = "Distribution of Median Sale Prices by State (2024)",
    x = "State",
    y = "Median Sale Price (in thousands)",
    caption = "SOURCE: American Enterprise Institute") +
  theme_minimal()

hist of median sale price

#hist plotting with df ..._all
ggplot(housing_2024_all, aes(x = Median_Sale_Price_per_k)) +
  geom_histogram(binwidth = 50, fill = "skyblue", color = "black") +
  scale_x_continuous(labels = scales::dollar_format(prefix = "$", suffix = "k")) +
  labs(
    title = "Distribution of Median Sale Prices (2024)",
    x = "Median Sale Price (in thousands)",
    y = "Count",
    caption = "SOURCE: American Enterprise Institute") +
  theme_minimal()

#needs a subtitle with commentary, just remembered all need a caption with the source named, and we can prob add other elements to this, because otherwise it seems to obvi to me

boxplot of sale price by affordability tier

ggplot(housing_2024, aes(x = Affordability, y = Median_Sale_Price_per_k, fill = Affordability)) +
  geom_boxplot() +
  labs(
    title = "Median Sale Price by Affordability Tier (2024)",
    x = "Affordability Tier",
    y = "Median Sale Price (in thousands)",
    caption = "SOURCE: American Enterprise Institute") +
  theme_minimal()

#no duh, there are more unpurchased expensive houses because people can't afford it 
#not sure how useful this is, but maybe as a starting baseline 

scatterplot of months supply vs median price

ggplot(housing_2024_all_plot, aes(x = Months_Supply, y = Median_Sale_Price_per_k)) +
  geom_point(color="steelblue", alpha = 0.7) +
  # geom_smooth(method = "lm", se = FALSE, color = "black") +
  labs(
    title = "Months Supply of Housing vs Median Sale Price",
    subtitle= "A Scatterplot Reveals a Positive Correlation Between Months Supply\nof Housing and Median Sale Price",
    x = "Months of Supply",
    y = "Median Sale Price (in thousands)",
    caption = "SOURCE: American Enterprise Institute") +
  theme_minimal()

#next round, add line of best fit/ regression line

correlation heatmap of numeric variables

#correlation heatmap of numeric variables w/ ..._all df
housing_numeric_all <- housing_2024_all %>%
  select(where(is.numeric) & !all_of("Year")) %>%
  drop_na()

cor_matrix_all <- cor(housing_numeric_all)

corrplot(cor_matrix_all, method = "color", type = "upper", tl.cex = 0.8, addCoef.col = "black", number.cex=0.5)

#interesting to see positive and inverse relationships between variables 
#did not scan for no relationships

both together in gray with compare in color with states labeled

#both together in gray with compare in color with states labeled w/ ..._all df
ggplot() +
  geom_point(data = housing_2024_all, 
             aes(x = Months_Supply, y = Median_Sale_Price_per_k), 
             color = "gray70", alpha = 0.3, size = 1) +
  # geom_path(data = housing_compare %>% filter(StateGroup == "States with the Most Houses"),
  #           aes(x = Months_Supply, y = Median_Sale_Price_per_k, group = State, color = State),
  #           alpha = 0.8, size = 1) +
  geom_point(data = housing_compare_all %>% filter(StateGroup == "States with the Most Houses"),
             aes(x = Months_Supply, y = Median_Sale_Price_per_k, color = State),
             alpha = 0.5, size = 2) +
  # geom_path(data = housing_compare %>% filter(StateGroup == "States with the Fewest Houses"),
  #           aes(x = Months_Supply, y = Median_Sale_Price_per_k, group = State, color = State),
  #           alpha = 0.8, size = 1) +
  geom_point(data = housing_compare_all %>% filter(StateGroup == "States with the Fewest Houses"),
             aes(x = Months_Supply, y = Median_Sale_Price_per_k, color = State),
             alpha = 0.5, size = 2) +
  scale_color_manual(values = state_colors_all) +
  labs(
    title = "Housing Supply vs Median Price: All Counties with Highlights",
    subtitle = "Gray points: All counties | Blue shades: States with Most Houses | Red shades: States with Fewest Houses",
    x = "Months of Supply",
    y = "Median Sale Price (in thousands)",
    color = "State",
    caption = "SOURCE: American Enterprise Institute") +
  theme_minimal()

new construction vs median sale price

housing_constr = housing_2024
housing_constr = housing_constr %>% 
  mutate(constr_bins = cut(New_Constr_by_share_of_sales_percent, breaks=10))

ggplot(housing_constr, aes(x = constr_bins, y = Median_Sale_Price_per_k,fill=constr_bins)) +
  geom_boxplot() +
  labs(
    title = "Median Sale Price by Percent New Construction (2024)",
    x = "New Construction by Share of Sales Percent",
    y = "Median Sale Price (in thousands)",
    caption = "SOURCE: American Enterprise Institute") +
  theme_minimal()

# Perhaps new construction indicates high demand, and as such, the more new construction, the higher the median sale price?
#I would like to know which states fall into these buckets. In which states are the top quarter and bottom quarter? BW
#I'm going to break this into smaller buckets and investigate.BW
#new construction vs median sale price w/ ..._all df
housing_constr_all = housing_2024_all
housing_constr_all = housing_constr_all %>% 
  mutate(constr_bins = cut(New_Constr_by_share_of_sales_percent, breaks=10))

ggplot(housing_constr_all, aes(x = constr_bins, y = Median_Sale_Price_per_k, fill=constr_bins)) +
  geom_boxplot() +
  labs(
    title = "Median Sale Price by Percent New Construction (2024)",
    x = "New Construction by Share of Sales Percent",
    y = "Median Sale Price (in thousands)",
    caption = "SOURCE: American Enterprise Institute") +
  theme_minimal()

# Perhaps new construction indicates high demand, and as such, the more new construction, the higher the median sale price?

Limit to Entry-Level houses

housing_constr_tiers = housing_2024_tiers
housing_constr_tiers = housing_constr_tiers %>% 
  mutate(constr_bins = cut(New_Constr_by_share_of_sales_percent, breaks=10))

ggplot(filter(housing_constr,Affordability=="entrylevel"), aes(x = constr_bins, y = Median_Sale_Price_per_k,fill=constr_bins)) +
  geom_boxplot() +
  labs(
    title = "Median Sale Price by Percent New Construction (2024)",
    x = "New Construction by Share of Sales Percent",
    y = "Median Sale Price (in thousands)",
    caption = "SOURCE: American Enterprise Institute") +
  theme_minimal()

# Even more pronounced here

Limit to Moveup houses

ggplot(filter(housing_constr,Affordability=="moveup"), aes(x = constr_bins, y = Median_Sale_Price_per_k,fill=constr_bins)) +
  geom_boxplot() +
  labs(
    title = "Median Sale Price by Percent New Construction (2024)",
    x = "New Construction by Share of Sales Percent",
    y = "Median Sale Price (in thousands)",
    caption = "SOURCE: American Enterprise Institute") +
  theme_minimal()

# Even more pronounced here

###mapping US counties by median sale price

##basemap

#reading in counties spatial file from tigris package
counties_2024 <- counties(year = 2013, 
                          cb = TRUE, 
                          class = "sf")
##   |                                                                              |                                                                      |   0%  |                                                                              |                                                                      |   1%  |                                                                              |=                                                                     |   1%  |                                                                              |=                                                                     |   2%  |                                                                              |==                                                                    |   2%  |                                                                              |==                                                                    |   3%  |                                                                              |===                                                                   |   4%  |                                                                              |===                                                                   |   5%  |                                                                              |====                                                                  |   5%  |                                                                              |====                                                                  |   6%  |                                                                              |=====                                                                 |   7%  |                                                                              |=====                                                                 |   8%  |                                                                              |======                                                                |   8%  |                                                                              |======                                                                |   9%  |                                                                              |=======                                                               |   9%  |                                                                              |=======                                                               |  10%  |                                                                              |=======                                                               |  11%  |                                                                              |========                                                              |  11%  |                                                                              |========                                                              |  12%  |                                                                              |=========                                                             |  12%  |                                                                              |=========                                                             |  13%  |                                                                              |=========                                                             |  14%  |                                                                              |==========                                                            |  14%  |                                                                              |==========                                                            |  15%  |                                                                              |===========                                                           |  15%  |                                                                              |===========                                                           |  16%  |                                                                              |============                                                          |  17%  |                                                                              |============                                                          |  18%  |                                                                              |=============                                                         |  18%  |                                                                              |=============                                                         |  19%  |                                                                              |==============                                                        |  19%  |                                                                              |==============                                                        |  20%  |                                                                              |==============                                                        |  21%  |                                                                              |===============                                                       |  21%  |                                                                              |===============                                                       |  22%  |                                                                              |================                                                      |  22%  |                                                                              |================                                                      |  23%  |                                                                              |=================                                                     |  24%  |                                                                              |=================                                                     |  25%  |                                                                              |==================                                                    |  25%  |                                                                              |==================                                                    |  26%  |                                                                              |===================                                                   |  27%  |                                                                              |===================                                                   |  28%  |                                                                              |====================                                                  |  28%  |                                                                              |====================                                                  |  29%  |                                                                              |=====================                                                 |  29%  |                                                                              |=====================                                                 |  30%  |                                                                              |=====================                                                 |  31%  |                                                                              |======================                                                |  31%  |                                                                              |======================                                                |  32%  |                                                                              |=======================                                               |  32%  |                                                                              |=======================                                               |  33%  |                                                                              |=======================                                               |  34%  |                                                                              |========================                                              |  34%  |                                                                              |========================                                              |  35%  |                                                                              |=========================                                             |  35%  |                                                                              |=========================                                             |  36%  |                                                                              |==========================                                            |  36%  |                                                                              |==========================                                            |  37%  |                                                                              |==========================                                            |  38%  |                                                                              |===========================                                           |  38%  |                                                                              |===========================                                           |  39%  |                                                                              |============================                                          |  39%  |                                                                              |============================                                          |  40%  |                                                                              |============================                                          |  41%  |                                                                              |=============================                                         |  41%  |                                                                              |=============================                                         |  42%  |                                                                              |==============================                                        |  42%  |                                                                              |==============================                                        |  43%  |                                                                              |==============================                                        |  44%  |                                                                              |===============================                                       |  44%  |                                                                              |===============================                                       |  45%  |                                                                              |================================                                      |  45%  |                                                                              |================================                                      |  46%  |                                                                              |=================================                                     |  47%  |                                                                              |=================================                                     |  48%  |                                                                              |==================================                                    |  48%  |                                                                              |==================================                                    |  49%  |                                                                              |===================================                                   |  49%  |                                                                              |===================================                                   |  50%  |                                                                              |===================================                                   |  51%  |                                                                              |====================================                                  |  51%  |                                                                              |====================================                                  |  52%  |                                                                              |=====================================                                 |  52%  |                                                                              |=====================================                                 |  53%  |                                                                              |======================================                                |  54%  |                                                                              |======================================                                |  55%  |                                                                              |=======================================                               |  55%  |                                                                              |=======================================                               |  56%  |                                                                              |========================================                              |  56%  |                                                                              |========================================                              |  57%  |                                                                              |========================================                              |  58%  |                                                                              |=========================================                             |  58%  |                                                                              |=========================================                             |  59%  |                                                                              |==========================================                            |  59%  |                                                                              |==========================================                            |  60%  |                                                                              |==========================================                            |  61%  |                                                                              |===========================================                           |  61%  |                                                                              |===========================================                           |  62%  |                                                                              |============================================                          |  62%  |                                                                              |============================================                          |  63%  |                                                                              |============================================                          |  64%  |                                                                              |=============================================                         |  64%  |                                                                              |=============================================                         |  65%  |                                                                              |==============================================                        |  65%  |                                                                              |==============================================                        |  66%  |                                                                              |===============================================                       |  67%  |                                                                              |===============================================                       |  68%  |                                                                              |================================================                      |  68%  |                                                                              |================================================                      |  69%  |                                                                              |=================================================                     |  69%  |                                                                              |=================================================                     |  70%  |                                                                              |=================================================                     |  71%  |                                                                              |==================================================                    |  71%  |                                                                              |==================================================                    |  72%  |                                                                              |===================================================                   |  72%  |                                                                              |===================================================                   |  73%  |                                                                              |===================================================                   |  74%  |                                                                              |====================================================                  |  74%  |                                                                              |====================================================                  |  75%  |                                                                              |=====================================================                 |  75%  |                                                                              |=====================================================                 |  76%  |                                                                              |======================================================                |  76%  |                                                                              |======================================================                |  77%  |                                                                              |======================================================                |  78%  |                                                                              |=======================================================               |  78%  |                                                                              |=======================================================               |  79%  |                                                                              |========================================================              |  79%  |                                                                              |========================================================              |  80%  |                                                                              |========================================================              |  81%  |                                                                              |=========================================================             |  81%  |                                                                              |=========================================================             |  82%  |                                                                              |==========================================================            |  82%  |                                                                              |==========================================================            |  83%  |                                                                              |===========================================================           |  84%  |                                                                              |===========================================================           |  85%  |                                                                              |============================================================          |  85%  |                                                                              |============================================================          |  86%  |                                                                              |=============================================================         |  87%  |                                                                              |=============================================================         |  88%  |                                                                              |==============================================================        |  88%  |                                                                              |==============================================================        |  89%  |                                                                              |===============================================================       |  89%  |                                                                              |===============================================================       |  90%  |                                                                              |===============================================================       |  91%  |                                                                              |================================================================      |  91%  |                                                                              |================================================================      |  92%  |                                                                              |=================================================================     |  92%  |                                                                              |=================================================================     |  93%  |                                                                              |=================================================================     |  94%  |                                                                              |==================================================================    |  94%  |                                                                              |==================================================================    |  95%  |                                                                              |===================================================================   |  95%  |                                                                              |===================================================================   |  96%  |                                                                              |====================================================================  |  96%  |                                                                              |====================================================================  |  97%  |                                                                              |====================================================================  |  98%  |                                                                              |===================================================================== |  98%  |                                                                              |===================================================================== |  99%  |                                                                              |======================================================================|  99%  |                                                                              |======================================================================| 100%
#excluding territories and shifting AK/HI
counties_2024 <- counties_2024 %>%
  filter(as.numeric(STATEFP) < 60) %>%
  shift_geometry()

#removing duplicate columns
counties_2024 <- counties_2024 %>%
  select(-NAME)

#creating basemap and checking
counties_2024_basemap <- ggplot() +
  geom_sf(data = counties_2024) +
  theme_void()

counties_2024_basemap

##prepping and merging housing_2014_all w/ counties_2014

housing_2024_all <- housing_2024_all %>%
  mutate(price_bin = cut(Median_Sale_Price_per_k,
                         breaks = c(0, 150, 300, 500, 750, 1000, Inf),
                         labels = c("<150k", "150-300k", "300-500k", "500-750k", "750k-1M", ">1M")))

#creating merge df and prepping counties df
housing_merge <- housing_2024_all

counties_2024 <- counties_2024 %>%
  mutate(GEOID = as.numeric(GEOID))

#adding id variables to track observations
counties_2024$id1 <- 1
housing_merge$id2 <- 1

#merging
housing_counties_merge <- merge(x = counties_2024,
                                y = housing_merge,
                                by.x = "GEOID",
                                by.y = "FIPS_County_Code",
                                all = TRUE)

#converting NAs to zeros and finding problem observations
  #looks like it merged just fine
    #i lied, coded wrong and fixed, now i realize theres a  
    #lot of problem observations 
housing_counties_merge <- housing_counties_merge %>%
  mutate(id1 = ifelse(is.na(id1), 0, id1),
         id2 = ifelse(is.na(id2), 0, id2))

no_merge <- housing_counties_merge %>%
  filter(id1 + id2 != 2)

unique(no_merge$GEOID)
##   [1]  1011  1013  1015  1023  1035  1041  1049  1055  1063  1071  1075  1079
##  [13]  1081  1087  1093  1105  1107  1123  2013  2016  2068  2070  2105  2164
##  [25]  2170  2185  2188  2198  2240  2261  2270  2275  2282  2290  5049  5077
##  [37]  9013 15005 17003 17013 17045 17055 17061 17071 17123 17147 17155 17159
##  [49] 17169 17171 17185 20003 20005 20007 20009 20019 20021 20023 20025 20027
##  [61] 20029 20031 20033 20035 20039 20041 20043 20047 20049 20051 20053 20057
##  [73] 20059 20063 20065 20067 20071 20075 20077 20079 20081 20083 20085 20089
##  [85] 20093 20095 20097 20099 20101 20105 20107 20109 20113 20115 20117 20119
##  [97] 20123 20125 20127 20129 20131 20133 20135 20137 20139 20141 20143 20145
## [109] 20147 20151 20153 20155 20157 20159 20161 20163 20165 20167 20171 20175
## [121] 20179 20181 20183 20185 20187 20189 20191 20193 20195 20199 20201 20203
## [133] 20205 20207 21095 21099 21105 21127 21129 21133 22021 22023 22025 22029
## [145] 22035 22039 22041 22043 22067 22083 22091 22093 22095 22107 22123 26009
## [157] 26029 26063 26075 26091 26095 26097 26113 26133 26153 28001 28003 28005
## [169] 28007 28011 28015 28019 28021 28023 28027 28031 28037 28041 28043 28049
## [181] 28051 28053 28055 28057 28061 28063 28065 28067 28069 28077 28083 28095
## [193] 28097 28099 28107 28115 28119 28125 28131 28135 28141 28143 28145 28149
## [205] 28157 28159 28161 28163 29001 29003 29015 29017 29057 29065 29081 29083
## [217] 29085 29087 29089 29093 29103 29113 29129 29139 29171 29177 29197 29199
## [229] 29221 29227 30003 30005 30011 30015 30017 30019 30025 30033 30037 30041
## [241] 30065 30071 30079 30083 30085 30091 30095 30099 30103 30107 30109 31053
## [253] 31075 31115 31165 32009 35011 35017 35021 35023 35059 38085 42021 45067
## [265] 46003 46007 46009 46011 46013 46015 46017 46023 46025 46031 46035 46037
## [277] 46041 46045 46047 46049 46051 46053 46055 46057 46059 46061 46063 46069
## [289] 46071 46075 46077 46079 46081 46083 46085 46087 46089 46091 46093 46095
## [301] 46097 46101 46105 46107 46113 46117 46119 46121 46123 46125 46129 46137
## [313] 48017 48023 48033 48045 48081 48095 48101 48111 48117 48125 48155 48173
## [325] 48195 48207 48243 48247 48261 48267 48269 48271 48275 48301 48305 48311
## [337] 48327 48333 48341 48345 48371 48389 48393 48411 48417 48445 48447 48455
## [349] 48483 49009 49015 49017 49019 49023 49031 49037 49055 51005 51515 51520
## [361] 51530 51540 51570 51580 51595 51600 51610 51620 51630 51640 51660 51670
## [373] 51678 51683 51685 51690 51720 51750 51770 51775 51790 51820 51830 51840
## [385] 53019 53039 53075 54003 54039 54093 56017 56027
#plotting and checking
housing_counties_plot <- ggplot() +
  geom_sf(data = housing_counties_merge,
          mapping = aes(fill = Median_Sale_Price_per_k),
          color = "white",
          linewidth = .4) +
  #scale_fill_brewer(palette = "GnBi") +
  theme(legend.position = "none") +
  theme_void()

housing_counties_plot

#for fun: trying st_simplify to see if it looks better
housing_counties_merge_simple <- st_simplify(housing_counties_merge, dTolerance = 75)

housing_counties_plot2 <- ggplot() +
  geom_sf(data = housing_counties_merge_simple,
          mapping = aes(fill = price_bin),
          color = "white",
          linewidth = .4) +
  scale_fill_brewer(palette = "RdYlBu",
                    direction = -1,
                    name = "Median Sale Price",
                    na.value = "grey85") +
  labs(title = "Geographic Variation in U.S. Home Prices, 2024",
       subtitle = "County-level median sale price",
       caption = "Source: AEI Housing Center (2024)") +
  theme_minimal(base_size = 11) +
  theme(plot.title = element_text(face = "bold", 
                                  size = 16),
        plot.subtitle = element_text(size = 12, 
                                     margin = margin(b = 8)),
        plot.caption = element_text(color = "gray40", 
                                    margin = margin(t = 8)),
        legend.title = element_text(face = "bold"),
        legend.position = "right",
        legend.key.height = unit(0.6, "cm"),
        panel.grid = element_blank(),
        axis.title = element_blank(),
        axis.text = element_blank(),
        plot.margin = margin(8, 12, 8, 12))
#+ theme_void()

housing_counties_plot2

####looks like there are just missing data for counties as they show up as white or gray in my map. guessing it has something to do with the removing na's code not working that alex and i were wondering about. will need to get to the bottom of this.
####i could of course leave the color gradient and show the concentration or i could bin ranges of prices to different colors on the gradient scale. perhaps by affordability or something else that's useful? want to convene with team.
#BW: I think bins and/or a more drastic color scale could help readability. Like- dark purple, medium pink, orangey-yellow would be immediately visibly different
####i plan to add useful titles, labels, and commentary where necessary when i work on this next.
#BW: when the regional differences are more clear, we need to think about the story this map tells and how it fits into our overall narrative. 
#hist of months supply

#hist plotting months supply with df ...top_states_all
ggplot(housing_compare_all, aes(x = reorder(State, Months_Supply, median), 
                         y = Months_Supply)) +
  geom_boxplot(fill = state_colors_all, alpha = 0.7) +
  coord_flip() +
  labs(
    title = "Distribution of Months Supply (2024)",
    x = "State",
    y = "Months Supply",
    caption = "SOURCE: American Enterprise Institute") +
  theme_minimal()

#needs a subtitle with commentary, just remembered all need a caption with the source named, and we can prob add other elements to this, because otherwise it seems to obvi to me




#z score
#confidence interval (on what tho, maybe median sale price and months supply)
#then add Zscore and CI onto scatterplot somehow?
ggplot(housing_compare_all, 
       aes(x = reorder(State, Months_Supply, median), 
           y = Median_Sale_Price_per_k)) +
  geom_boxplot(fill = state_colors_all, alpha = 0.7) +
  coord_flip() +
  labs(
    title = "Median Sale Price by State Ordered by Months of Supply (2024)",
    x = "State (ordered by median Months of Supply)",
    y = "Median Sale Price (in thousands)",
    caption = "SOURCE: American Enterprise Institute"
  ) +
  theme_minimal()